Fancy Histograms!

We’ll start by making some histograms.

# install.packages("dslabs")
library(dslabs)
# Load the `heights` data frame that ships with dslabs, then preview its columns.
data("heights", package = "dslabs")
glimpse(heights)
## Rows: 1,050
## Columns: 2
## $ sex    <fct> Male, Male, Male, Male, Male, Female, Female, Female, Female, M…
## $ height <dbl> 75, 70, 68, 74, 61, 65, 66, 62, 66, 67, 72, 72, 69, 68, 69, 66,…

This data is the heights of humans, divided by their biological sex.

Use ggplot to make a histogram of all of the heights:

library(ggplot2)

# Histogram of every recorded height, one bin per inch.
height_hist <- ggplot(data = heights, mapping = aes(x = height)) +
  geom_histogram(binwidth = 1, colour = "brown", fill = "pink") +
  ggtitle("Human Heights") +
  xlab("Height (inches)") +
  ylab("Count") +
  theme_minimal()
height_hist

Change up the binwidth and see how the plots change. Try 1, 5, 10, and 20.

# Everything the four histograms share: data, x mapping, axis labels and theme.
# Each plot below only adds its own histogram layer and title.
height_axes <- ggplot(heights, aes(x = height)) +
  xlab("Height (inches)") +
  ylab("Count") +
  theme_minimal()

# 1
height_axes +
  geom_histogram(binwidth = 1, colour = "purple", fill = "yellow") +
  ggtitle("Histogram of Heights (Binwidth = 1)")

# 5
height_axes +
  geom_histogram(binwidth = 5, colour = "red", fill = "blue") +
  ggtitle("Histogram of Heights (Binwidth = 5)")

# 10
height_axes +
  geom_histogram(binwidth = 10, colour = "green", fill = "salmon") +
  ggtitle("Histogram of Heights (Binwidth = 10)")

# 20
height_axes +
  geom_histogram(binwidth = 20, colour = "black", fill = "lightblue") +
  ggtitle("Histogram of Heights (Binwidth = 20)")

Smooth this out to an empirical density with geom_density()

# Histogram rescaled to a density so the kernel density curve can share its
# y axis. after_stat(density) and linewidth replace the `..density..` and
# `size` forms that were deprecated in ggplot2 3.4.0, so this chunk no longer
# emits lifecycle warnings.
ggplot(heights, aes(x = height)) +
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 1,
    fill = "orange",
    color = "blue"
  ) +
  geom_density(color = "black", linewidth = 1) +
  labs(
    title = "Density Overlay",
    x = "Height (inches)",
    y = "Density"
  ) +
  theme_minimal()

Use a new argument in the aes(), group = to split this density by sex

# One density curve per sex: `group` splits the data, while colour and fill
# make the two curves distinguishable.
sex_density <- ggplot(heights) +
  aes(x = height, group = sex, colour = sex, fill = sex) +
  geom_density(alpha = 0.6) +
  ggtitle("Density of Heights by Sex") +
  xlab("Height (inches)") +
  ylab("Density") +
  theme_minimal()
sex_density

OR we can do it with color or fill. If you say you want to color by sex, R knows that you want a different curve for each of them.

# Mapping fill to a discrete variable is enough to split the density by sex;
# no explicit `group` is needed. The fill is left opaque here on purpose so
# the overlap problem described next is visible before alpha is introduced.
ggplot(heights, aes(x = height, fill = sex)) +
  geom_density() +
  labs(
    title = "Density of Heights by Sex",
    x = "Height (inches)",
    y = "Density"
  ) +
  theme_minimal()

If you’ve used fill, then there is now a slight issue that they are overlapped. We can fix this with alpha transparency!

# Semi-transparent fills let both overlapping density curves stay visible.
sex_density_alpha <- ggplot(heights) +
  aes(x = height, fill = sex, group = sex) +
  geom_density(alpha = 0.4) +
  ggtitle("Density of Heights by Sex") +
  xlab("Height (inches)") +
  ylab("Density") +
  theme_minimal()
sex_density_alpha

Let’s make some boxplots of the same information.

# Side-by-side boxplots of height, one box per sex.
height_box <- heights %>%
  ggplot(aes(x = sex, y = height, fill = sex)) +
  geom_boxplot() +
  ggtitle("Boxplot of Heights by Sex") +
  xlab("Sex") +
  ylab("Height (inches)") +
  theme_minimal()
height_box

Quantitative summaries:

Find the mean and median overall.

# Mean of all heights, ignoring sex.
with(heights, mean(height))
## [1] 68.32301
# Median of all heights, ignoring sex.
with(heights, median(height))
## [1] 68.5

Find the mean and median for both groups.

library(dplyr)

# Mean and median height within each sex; the grouping is dropped afterwards
# so the result is a plain two-row tibble.
heights_by_sex <- group_by(heights, sex)
summarise(
  heights_by_sex,
  mean_height = mean(height),
  median_height = median(height),
  .groups = "drop"
)
## # A tibble: 2 × 3
##   sex    mean_height median_height
##   <fct>        <dbl>         <dbl>
## 1 Female        64.9          65.0
## 2 Male          69.3          69

How tall is the tallest woman? How short is the shortest man?

# Tallest female
summarise(
  filter(heights, sex == "Female"),
  tallest_woman = max(height)
)
##   tallest_woman
## 1            79
# Shortest male
summarise(
  filter(heights, sex == "Male"),
  shortest_man = min(height)
)
##   shortest_man
## 1           50

Presidential Elections Data

# install.packages("pscl")
library(pscl) # loads in the package that has this data. 
## Classes and Methods for R originally developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University (2002-2015),
## by and under the direction of Simon Jackman.
## hurdle and zeroinfl functions by Achim Zeileis.
## You might need to install this...

# Data for presidential elections: work on a copy of the pscl dataset so the
# lab can modify it freely.
votedata <- pscl::presidentialElections
votedata %>% glimpse()
## Rows: 1,097
## Columns: 4
## $ state   <chr> "Alabama", "Arizona", "Arkansas", "California", "Colorado", "C…
## $ demVote <dbl> 84.76, 67.03, 86.27, 58.41, 54.81, 47.40, 48.11, 74.49, 91.60,…
## $ year    <int> 1932, 1932, 1932, 1932, 1932, 1932, 1932, 1932, 1932, 1932, 19…
## $ south   <lgl> TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FAL…

Let’s look at the democratic vote by state for 2000. We can’t use geom_bar for a bar chart, since we have the category in one variable and the “height” of the bar in another. We need geom_col()

Make a bar graph of the democratic vote by state in 2000.

# Keep only the 2000 election; later chunks reuse `votedata_2000`.
votedata_2000 <- votedata %>%
  filter(year == 2000)

# First attempt: a plain bar chart with states on the x axis. The heights come
# from a column, so geom_col() is used rather than geom_bar(). Flipping and
# reordering are deliberately left to the following steps of the exercise.
ggplot(votedata_2000, aes(x = state, y = demVote)) +
  geom_col(fill = "blue") +
  labs(
    title = "Democratic Vote by State (2000)",
    x = "State",
    y = "Democratic Vote (%)"
  ) +
  theme_minimal()

Well this looks awful. We have two options: swap the x and y or the more fun sounding… Coordinate flip!

Use coord_flip() on the previous graph to make it better.

# Same bar chart with the axes flipped so the state names are readable.
# States stay in their default order here; reordering is the next step.
ggplot(votedata_2000, aes(x = state, y = demVote)) +
  geom_col(fill = "blue") +
  coord_flip() +
  labs(
    title = "Democratic Vote by State (2000)",
    x = "State",
    y = "Democratic Vote (%)"
  ) +
  theme_minimal()

I don’t love the squashed together coordinates, but it’s a display window issue.

So. This is a helpful graph, but it would be more helpful if it was ordered. Use x = reorder(x_variable, y_variable) in aes() to order the x variable by the y variable

# Flipped bar chart with states sorted by their Democratic vote share.
ordered_bars <- votedata_2000 %>%
  ggplot(aes(x = reorder(state, demVote), y = demVote)) +
  geom_col(fill = "blue") +
  coord_flip() +
  ggtitle("Democratic Vote by State (2000)") +
  xlab("State (ordered by Democratic vote %)") +
  ylab("Democratic Vote (%)") +
  theme_minimal()
ordered_bars

So, what if I want to see what the northern and southern states did differently?

start with a facet_wrap using the south variable:

# One panel per value of `south` (FALSE / TRUE).
south_panels <- votedata_2000 %>%
  ggplot(aes(x = reorder(state, demVote), y = demVote)) +
  geom_col(fill = "blue") +
  coord_flip() +
  facet_wrap(vars(south)) +
  ggtitle("Democratic Vote by State (2000)") +
  xlab("State (ordered by Democratic vote %)") +
  ylab("Democratic Vote (%)") +
  theme_minimal()
south_panels

Okay, that’s not great. Let’s color by south instead.

# A single panel again; bar colour now encodes the `south` flag.
south_fill <- ggplot(votedata_2000) +
  aes(x = reorder(state, demVote), y = demVote, fill = south) +
  geom_col() +
  coord_flip() +
  ggtitle("Democratic Vote by State (2000)") +
  xlab("State (ordered by Democratic vote %)") +
  ylab("Democratic Vote (%)") +
  labs(fill = "Southern State") +
  theme_minimal()
south_fill

I’m a good data scientist, so I want my plot to have a name and my axes to have labels! Use labs to add a title, subtitle, and x and y labels.

# Fully labelled version: title, subtitle, both axes and the legend.
titled_bars <- ggplot(votedata_2000) +
  aes(x = reorder(state, demVote), y = demVote, fill = south) +
  geom_col() +
  coord_flip() +
  ggtitle(
    "Democratic Vote by State in the 2000 United States Presidential Election",
    subtitle = "States colored by whether they are in the South"
  ) +
  xlab("State (ordered by Democratic vote %)") +
  ylab("Democratic Vote (%)") +
  labs(fill = "Southern State") +
  theme_minimal()
titled_bars

You can move the legend with theme(legend.position = "bottom")

# Labelled bar chart with the legend moved below the panel. The theme() tweak
# must come after theme_minimal(), which would otherwise reset it.
legend_below <- votedata_2000 %>%
  ggplot(aes(x = reorder(state, demVote), y = demVote, fill = south)) +
  geom_col() +
  coord_flip() +
  ggtitle(
    "Democratic Vote by State in the 2000 United States Presidential Election",
    subtitle = "States colored by whether they are in the South"
  ) +
  xlab("State (ordered by Democratic vote %)") +
  ylab("Democratic Vote (%)") +
  labs(fill = "Southern State") +
  theme_minimal() +
  theme(legend.position = "bottom")
legend_below

What else could we facet by? years! Let’s filter to year in 2008 and 2016, then facet by years.

library(dplyr)
library(ggplot2)

# Keep the 2008 and 2016 elections; later chunks reuse `votedata_filtered`.
votedata_filtered <- filter(votedata, year %in% c(2008, 2016))

# One panel per election year, bars coloured by the `south` flag.
year_panels <- votedata_filtered %>%
  ggplot(aes(x = reorder(state, demVote), y = demVote, fill = south)) +
  geom_col() +
  coord_flip() +
  facet_wrap(vars(year)) +
  ggtitle(
    "Democratic Vote by State (2008 vs 2016)",
    subtitle = "States colored by whether they are in the South"
  ) +
  xlab("State (ordered by Democratic vote %)") +
  ylab("Democratic Vote (%)") +
  labs(fill = "Southern State") +
  theme_minimal() +
  theme(legend.position = "bottom")
year_panels

We need to know who won! We could add a line at 50% to mark a majority of the vote. The layer geom_hline() adds a horizontal line at a given y value; because our plot uses coord_flip(), that line is drawn vertically on the page. (What do you guess geom_vline() would do?)

# Faceted bar chart with a dashed reference line at 50% of the vote.
# geom_hline() is defined on the y aesthetic (demVote); coord_flip() then draws
# it vertically. `linewidth` replaces `size`, which was deprecated for line
# geoms in ggplot2 3.4.0.
ggplot(votedata_filtered, aes(x = reorder(state, demVote), y = demVote, fill = south)) +
  geom_col() +
  geom_hline(yintercept = 50, linetype = "dashed", color = "red", linewidth = 1) +
  coord_flip() +
  facet_wrap(~ year) +
  labs(
    title = "Democratic Vote by State (2008 vs 2016)",
    subtitle = "Dashed red line indicates 50%",
    x = "State (ordered by Democratic vote %)",
    y = "Democratic Vote (%)",
    fill = "Southern State"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

Getting fancy with a map!

When using geom_polygon or geom_map, you will typically need two data frames:

  • one contains the coordinates of each polygon (positions)
  • the other the values associated with each polygon (values).

An id variable links the two together.

Run the below code to get a map graph.

library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
# map_data("state") uses lowercase region names, so the state names in the
# vote data must be lowercased before the two can be linked by map_id.
votedata[["state"]] <- tolower(votedata[["state"]])

# Longitude/latitude of every boundary point of every state.
states_map <- map_data("state")

# Democratic vote share by state, 2008.
votes_2008 <- filter(votedata, year == 2008)
map_plot <- ggplot(votes_2008, aes(map_id = state)) +
  geom_map(mapping = aes(fill = demVote), map = states_map) +
  expand_limits(x = states_map$long, y = states_map$lat)
map_plot

# Democratic vote share by state, 2016 (reuses the `map_plot` name).
votes_2016 <- filter(votedata, year == 2016)
map_plot <- ggplot(votes_2016, aes(map_id = state)) +
  geom_map(mapping = aes(fill = demVote), map = states_map) +
  expand_limits(x = states_map$long, y = states_map$lat)
map_plot

What if I want a map that shows which of the states are “south”? What do I change?

# Same map, but filled by the logical `south` flag instead of the vote share.
south_2016 <- filter(votedata, year == 2016)
map_plot <- ggplot(south_2016, aes(map_id = state)) +
  geom_map(mapping = aes(fill = south), map = states_map) +
  expand_limits(x = states_map$long, y = states_map$lat) +
  ggtitle("Southern vs Non Southern States (2016)") +
  labs(fill = "Southern State") +
  theme_minimal()

map_plot

Some more dplyr practice

I want to know the average democratic vote for N vs S, by year.

First, find the average democratic votes for the north and the south, every year. You’ll need to do a double group_by() here. You do it in one call of the function.

library(dplyr)

# Average Democratic vote share for every (year, south) combination.
votes_by_year_region <- group_by(votedata, year, south)
avg_dem_votes <- summarise(
  votes_by_year_region,
  avg_dem_vote = mean(demVote, na.rm = TRUE),
  .groups = "drop"
)

avg_dem_votes
## # A tibble: 44 × 3
##     year south avg_dem_vote
##    <int> <lgl>        <dbl>
##  1  1932 FALSE         56.7
##  2  1932 TRUE          83.4
##  3  1936 FALSE         59.2
##  4  1936 TRUE          83.2
##  5  1940 FALSE         52.8
##  6  1940 TRUE          80.9
##  7  1944 FALSE         51.1
##  8  1944 TRUE          75.1
##  9  1948 FALSE         50.2
## 10  1948 TRUE          45.9
## # ℹ 34 more rows

Then, let’s plot that! Pipe the result of your group_by and summarize to ggplot and geom_line(), with year on the x axis and your summarized value on the y axis. Color by the south variable.

# Summarise to one row per (year, south) and pipe straight into a line plot:
# one line per region, year on the x axis. `linewidth` replaces `size`, which
# was deprecated for line geoms in ggplot2 3.4.0.
votedata %>%
  group_by(year, south) %>%
  summarise(
    avg_dem_vote = mean(demVote, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = year, y = avg_dem_vote, color = south)) +
  geom_line(linewidth = 1.2) +
  labs(
    title = "Average Democratic Vote by Region Time to Time",
    subtitle = "Comparing Southern and Non Southern States",
    x = "Year",
    y = "Average Democratic Vote (%)",
    color = "Southern State"
  ) +
  theme_minimal()

Layering plots!

Penguins!

library(palmerpenguins)
# Preview the columns of the penguins data frame.
penguins %>% glimpse()
## Rows: 344
## Columns: 8
## $ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
## $ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
## $ bill_length_mm    <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
## $ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
## $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
## $ body_mass_g       <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
## $ sex               <fct> male, female, female, NA, female, male, female, male…
## $ year              <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…

We can use boxplots to visualize the distribution of weight (body_mass_g) within each species:

library(ggplot2)

# Body mass distribution per species, one coloured box each.
mass_box <- penguins %>%
  ggplot(aes(x = species, y = body_mass_g, fill = species)) +
  geom_boxplot() +
  ggtitle("Distribution of Penguin Body Mass by Species") +
  xlab("Species") +
  ylab("Body Mass (g)") +
  theme_minimal()
mass_box
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

What if we also want the points? Layering!! Add a geom_point to your existing boxplot. geom_boxplot + geom_point!

# Boxplot with the raw observations layered on top using a plain geom_point().
# The points are intentionally NOT jittered here: they stack on the species
# axis, which is the problem the next step solves with geom_jitter().
# outlier.shape = NA hides the boxplot's own outlier markers, since every
# observation is already drawn by the point layer.
ggplot(penguins, aes(x = species, y = body_mass_g, fill = species)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.6) +
  geom_point(alpha = 0.7) +
  labs(
    title = "Penguin Body Mass by Species",
    x = "Species",
    y = "Body Mass (g)"
  ) +
  theme_minimal()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

But, these are all stacked up… to actually see them, use “geom_jitter” instead of points

# Boxplot first, then horizontally jittered points drawn on top of it.
mass_jitter <- ggplot(penguins) +
  aes(x = species, y = body_mass_g, fill = species) +
  geom_boxplot(outlier.shape = NA, alpha = 0.6) +
  geom_jitter(width = 0.2, alpha = 0.7) +
  ggtitle("Penguin Body Mass by Species") +
  xlab("Species") +
  ylab("Body Mass (g)") +
  theme_minimal()
mass_jitter
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

How to get the boxplots on top? The layers are plotted in the order you give them, so change to geom_point + geom_boxplot. (You might want to change the alpha on the boxplot to be able to see the points under them.)

# Layers draw in the order they are added: points first, then a mostly
# transparent boxplot on top of them.
points_then_box <- ggplot(penguins) +
  aes(x = species, y = body_mass_g, fill = species) +
  geom_jitter(width = 0.2, alpha = 0.6) +
  geom_boxplot(alpha = 0.3, outlier.shape = NA) +
  ggtitle("Penguin Body Mass by Species") +
  xlab("Species") +
  ylab("Body Mass (g)") +
  theme_minimal()
points_then_box
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

Maybe let’s try replacing the boxplot with a geom_violin()?

# Jittered points with an outline-free, semi-transparent violin drawn over them.
points_then_violin <- penguins %>%
  ggplot(aes(x = species, y = body_mass_g, fill = species)) +
  geom_jitter(width = 0.2, alpha = 0.6) +
  geom_violin(alpha = 0.4, colour = NA) +
  ggtitle("Penguin Body Mass by Species") +
  xlab("Species") +
  ylab("Body Mass (g)") +
  theme_minimal()
points_then_violin
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

If time: More Practice with Penguins

# 1 
library(dplyr)

# Number of penguins in every species/sex combination (NA sex included).
count(penguins, species, sex)
## # A tibble: 8 × 3
##   species   sex        n
##   <fct>     <fct>  <int>
## 1 Adelie    female    73
## 2 Adelie    male      73
## 3 Adelie    <NA>       6
## 4 Chinstrap female    34
## 5 Chinstrap male      34
## 6 Gentoo    female    58
## 7 Gentoo    male      61
## 8 Gentoo    <NA>       5
# 2 
# Mean body mass per island, skipping missing masses.
summarise(
  group_by(penguins, island),
  avg_mass = mean(body_mass_g, na.rm = TRUE)
)
## # A tibble: 3 × 2
##   island    avg_mass
##   <fct>        <dbl>
## 1 Biscoe       4716.
## 2 Dream        3713.
## 3 Torgersen    3706.
# 3
# Mean bill length per sex, skipping missing lengths; NA sex is its own group.
summarise(
  group_by(penguins, sex),
  avg_bill_length = mean(bill_length_mm, na.rm = TRUE)
)
## # A tibble: 3 × 2
##   sex    avg_bill_length
##   <fct>            <dbl>
## 1 female            42.1
## 2 male              45.9
## 3 <NA>              41.3
# 4
library(ggplot2)

# Bill length against bill depth for female penguins only.
female_penguins <- filter(penguins, sex == "female")
ggplot(female_penguins, aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_point() +
  ggtitle("Bill Length vs Depth (Females Only)") +
  xlab("Bill Length (mm)") +
  ylab("Bill Depth (mm)") +
  theme_minimal()

# 5 
# Flipper length against body mass, coloured by species, one panel per island.
island_panels <- penguins %>%
  ggplot(aes(x = flipper_length_mm, y = body_mass_g, colour = species)) +
  geom_point() +
  facet_wrap(vars(island)) +
  ggtitle("Flipper Length vs Body Mass by Island") +
  xlab("Flipper Length (mm)") +
  ylab("Body Mass (g)") +
  labs(colour = "Species") +
  theme_minimal()
island_panels
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# 6
# Flipper length density, one semi-transparent curve per sex.
flipper_density <- ggplot(penguins) +
  aes(x = flipper_length_mm, colour = sex, fill = sex) +
  geom_density(alpha = 0.4) +
  ggtitle("Density of Flipper Length by Sex") +
  xlab("Flipper Length (mm)") +
  ylab("Density") +
  theme_minimal()
flipper_density
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).

# 7
# Body mass density per study year. `year` is stored as an integer, so it is
# wrapped in factor() to get one discrete curve and legend entry per year.
mass_by_year <- ggplot(penguins) +
  aes(x = body_mass_g, colour = factor(year), fill = factor(year)) +
  geom_density(alpha = 0.4) +
  ggtitle("Density of Body Mass by Year") +
  xlab("Body Mass (g)") +
  ylab("Density") +
  labs(fill = "Year", colour = "Year") +
  theme_minimal()
mass_by_year
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).